3  NYC Building Violation Analysis EDAV Project

Author

Shreya Shetty (svs2148) & Shruti Shetty (ss7592)

3.1 Loading Libraries

Code
# Libraries
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Code
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
Code
library(janitor)

Attaching package: 'janitor'
The following objects are masked from 'package:stats':

    chisq.test, fisher.test
Code
library(ggplot2)
library(ggalluvial)
library(tidyr)
library(scales)
library(data.table)

Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':

    hour, isoweek, mday, minute, month, quarter, second, wday, week,
    yday, year
The following objects are masked from 'package:dplyr':

    between, first, last
Code
library(readr)

Attaching package: 'readr'
The following object is masked from 'package:scales':

    col_factor
Code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.1     ✔ stringr 1.6.0
✔ purrr   1.1.0     ✔ tibble  3.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ data.table::between() masks dplyr::between()
✖ readr::col_factor()   masks scales::col_factor()
✖ purrr::discard()      masks scales::discard()
✖ dplyr::filter()       masks stats::filter()
✖ data.table::first()   masks dplyr::first()
✖ data.table::hour()    masks lubridate::hour()
✖ data.table::isoweek() masks lubridate::isoweek()
✖ dplyr::lag()          masks stats::lag()
✖ data.table::last()    masks dplyr::last()
✖ data.table::mday()    masks lubridate::mday()
✖ data.table::minute()  masks lubridate::minute()
✖ data.table::month()   masks lubridate::month()
✖ data.table::quarter() masks lubridate::quarter()
✖ data.table::second()  masks lubridate::second()
✖ purrr::transpose()    masks data.table::transpose()
✖ data.table::wday()    masks lubridate::wday()
✖ data.table::week()    masks lubridate::week()
✖ data.table::yday()    masks lubridate::yday()
✖ data.table::year()    masks lubridate::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(forcats)
library(stringr)
library(redav)

3.2 Loading Housing Violations and 311 Complaints Datasets

Code
# ========== HOUSING VIOLATIONS ==========
housing_raw <- read.csv("/Users/shreyashetty/Documents/Fall 2025 Courses/EDAV/Final_Project/edav_project/datasets/Housing_Violations_2022_onwards.csv")

# ========== 311 HOUSING COMPLAINTS ==========
sr311_raw   <- read.csv("/Users/shreyashetty/Documents/Fall 2025 Courses/EDAV/Final_Project/edav_project/datasets/311_Housing_Complaints_2022_onwards.csv")


cat("Housing Violations:", nrow(housing_raw), "rows x", ncol(housing_raw), "columns\n")
Housing Violations: 589005 rows x 41 columns
Code
cat("311 Complaints:", nrow(sr311_raw ), "rows x", ncol(sr311_raw ), "columns\n")
311 Complaints: 425248 rows x 42 columns

3.3 Inspecting data and column names for both the datasets:

3.3.1 Housing Violations Dataset

Code
head(housing_raw)
  ViolationID BuildingID RegistrationID BoroID   Borough HouseNumber
1    15004789     805012         100420      1 MANHATTAN     144REAR
2    14817075       9458         104092      1 MANHATTAN           1
3    14831546      40441         118126      1 MANHATTAN         158
4    15313781      18016         120064      1 MANHATTAN         314
5    14838558       8157         118733      1 MANHATTAN        3885
6    14914309      43369         111345      1 MANHATTAN         502
  LowHouseNumber HighHouseNumber         StreetName StreetCode Postcode
1        144rear        146 REAR      LUDLOW STREET      25090    10002
2              1               7 CHRISTOPHER STREET      15210    10014
3            158             158    WEST 132 STREET      36410    10027
4            314             314     EAST 89 STREET      18770    10128
5           3885            3897           BROADWAY      13610    10032
6            502             502    WEST 180 STREET      37350    10033
  Apartment Story Block Lot Class InspectionDate ApprovedDate
1              NA   411  45     A     2022-03-08   03/08/2022
2        9C     9   610  60     A     2022-02-10   02/10/2022
3        3N     3  1916  55     B     2022-02-11   02/11/2022
4               0  1551  43     C     2022-08-16   08/17/2022
5       20B     2  2137  80     B     2022-02-14   02/15/2022
6         7     3  2152  45     B     2022-02-23   02/24/2022
  OriginalCertifyByDate OriginalCorrectByDate NewCertifyByDate NewCorrectByDate
1            06/25/2022            06/11/2022                                  
2            05/31/2022            05/17/2022                                  
3            04/04/2022            03/21/2022                                  
4            09/06/2022            09/01/2022                                  
5            04/06/2022            03/23/2022                                  
6            04/15/2022            04/01/2022                                  
  CertifiedDate OrderNumber   NOVID
1                      1507 7793231
2                       554 7636476
3                       579 7638268
4    12/05/2022         530 8007840
5    03/18/2022         579 7651705
6    03/18/2022        1503 7724697
                                                                                                                                                                                         NOVDescription
1 (A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS.
2                           § 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH
3                                                § 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST
4                                                                                  § 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD
5                                             § 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST
6                             § 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH
  NOVIssuedDate CurrentStatusID       CurrentStatus CurrentStatusDate  NovType
1    03/08/2022               9 VIOLATION DISMISSED        03/23/2022 Original
2    02/11/2022              19    VIOLATION CLOSED        03/06/2024 Original
3    02/14/2022               2        NOV SENT OUT        02/14/2022 Original
4    08/18/2022              19    VIOLATION CLOSED        06/29/2023 Original
5    02/16/2022               9 VIOLATION DISMISSED        05/31/2022 Original
6    02/25/2022               9 VIOLATION DISMISSED        05/31/2022 Original
  ViolationStatus RentImpairing Latitude Longitude CommunityBoard
1           Close             N       NA        NA             NA
2           Close             N 40.73407 -73.99976              2
3            Open             N 40.81285 -73.94384             10
4           Close             N 40.77955 -73.94975              8
5           Close             N 40.83741 -73.94244             12
6           Close             N 40.84754 -73.93169             12
  CouncilDistrict CensusTract     BIN        BBL                        NTA
1              NA          NA      NA         NA                           
2               3          71 1010692 1006100060               West Village
3               9         226 1058117 1019160055             Harlem (North)
4               5       14602 1050093 1015510043  Upper East Side-Yorkville
5               7         245 1063363 1021370080 Washington Heights (South)
6              10         261 1063601 1021520045 Washington Heights (South)
Code
colnames(housing_raw)
 [1] "ViolationID"           "BuildingID"            "RegistrationID"       
 [4] "BoroID"                "Borough"               "HouseNumber"          
 [7] "LowHouseNumber"        "HighHouseNumber"       "StreetName"           
[10] "StreetCode"            "Postcode"              "Apartment"            
[13] "Story"                 "Block"                 "Lot"                  
[16] "Class"                 "InspectionDate"        "ApprovedDate"         
[19] "OriginalCertifyByDate" "OriginalCorrectByDate" "NewCertifyByDate"     
[22] "NewCorrectByDate"      "CertifiedDate"         "OrderNumber"          
[25] "NOVID"                 "NOVDescription"        "NOVIssuedDate"        
[28] "CurrentStatusID"       "CurrentStatus"         "CurrentStatusDate"    
[31] "NovType"               "ViolationStatus"       "RentImpairing"        
[34] "Latitude"              "Longitude"             "CommunityBoard"       
[37] "CouncilDistrict"       "CensusTract"           "BIN"                  
[40] "BBL"                   "NTA"                  
Code
summary(housing_raw)
  ViolationID         BuildingID      RegistrationID       BoroID 
 Min.   :14753195   Min.   :      1   Min.   :     0   Min.   :1  
 1st Qu.:15642804   1st Qu.:  13158   1st Qu.:107423   1st Qu.:1  
 Median :16530701   Median :  27948   Median :116840   Median :1  
 Mean   :16557793   Mean   : 101240   Mean   :120363   Mean   :1  
 3rd Qu.:17447307   3rd Qu.:  41552   3rd Qu.:129130   3rd Qu.:1  
 Max.   :18445551   Max.   :1017519   Max.   :670032   Max.   :1  
                                                                  
   Borough          HouseNumber        LowHouseNumber     HighHouseNumber   
 Length:589005      Length:589005      Length:589005      Length:589005     
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
  StreetName          StreetCode       Postcode      Apartment        
 Length:589005      Min.   :    0   Min.   : 2019   Length:589005     
 Class :character   1st Qu.:17090   1st Qu.:10025   Class :character  
 Mode  :character   Median :28390   Median :10031   Mode  :character  
                    Mean   :25830   Mean   :10033                     
                    3rd Qu.:36210   3rd Qu.:10034                     
                    Max.   :72001   Max.   :10463                     
                                    NA's   :147                       
     Story             Block           Lot               Class          
 Min.   :  0.000   Min.   :    5   Length:589005      Length:589005     
 1st Qu.:  1.000   1st Qu.: 1441   Class :character   Class :character  
 Median :  3.000   Median : 1929   Mode  :character   Mode  :character  
 Mean   :  3.513   Mean   : 1685                                        
 3rd Qu.:  5.000   3rd Qu.: 2114                                        
 Max.   :964.000   Max.   :20094                                        
 NA's   :77094                                                          
 InspectionDate     ApprovedDate       OriginalCertifyByDate
 Length:589005      Length:589005      Length:589005        
 Class :character   Class :character   Class :character     
 Mode  :character   Mode  :character   Mode  :character     
                                                            
                                                            
                                                            
                                                            
 OriginalCorrectByDate NewCertifyByDate   NewCorrectByDate   CertifiedDate     
 Length:589005         Length:589005      Length:589005      Length:589005     
 Class :character      Class :character   Class :character   Class :character  
 Mode  :character      Mode  :character   Mode  :character   Mode  :character  
                                                                               
                                                                               
                                                                               
                                                                               
  OrderNumber         NOVID          NOVDescription     NOVIssuedDate     
 Min.   :   4.0   Min.   : 7594735   Length:589005      Length:589005     
 1st Qu.: 508.0   1st Qu.: 8266552   Class :character   Class :character  
 Median : 556.0   Median : 8878711   Mode  :character   Mode  :character  
 Mean   : 677.2   Mean   : 8867791                                        
 3rd Qu.: 672.0   3rd Qu.: 9466868                                        
 Max.   :1524.0   Max.   :10127752                                        
                  NA's   :34593                                           
 CurrentStatusID CurrentStatus      CurrentStatusDate    NovType         
 Min.   : 1.00   Length:589005      Length:589005      Length:589005     
 1st Qu.: 9.00   Class :character   Class :character   Class :character  
 Median : 9.00   Mode  :character   Mode  :character   Mode  :character  
 Mean   :12.87                                                           
 3rd Qu.:19.00                                                           
 Max.   :50.00                                                           
                                                                         
 ViolationStatus    RentImpairing         Latitude       Longitude     
 Length:589005      Length:589005      Min.   :40.70   Min.   :-74.02  
 Class :character   Class :character   1st Qu.:40.78   1st Qu.:-73.97  
 Mode  :character   Mode  :character   Median :40.81   Median :-73.95  
                                       Mean   :40.80   Mean   :-73.95  
                                       3rd Qu.:40.83   3rd Qu.:-73.94  
                                       Max.   :40.88   Max.   :-73.91  
                                       NA's   :77      NA's   :77      
 CommunityBoard   CouncilDistrict   CensusTract         BIN         
 Min.   : 1.000   Min.   : 1.000   Min.   :    6   Min.   :1000000  
 1st Qu.: 7.000   1st Qu.: 6.000   1st Qu.:  183   1st Qu.:1052051  
 Median :10.000   Median : 8.000   Median :  230   Median :1060493  
 Mean   : 8.877   Mean   : 7.215   Mean   : 2592   Mean   :1054353  
 3rd Qu.:12.000   3rd Qu.: 9.000   3rd Qu.:  279   3rd Qu.:1063870  
 Max.   :12.000   Max.   :17.000   Max.   :31704   Max.   :2000000  
 NA's   :77       NA's   :77       NA's   :77      NA's   :305      
      BBL                NTA           
 Min.   :0.000e+00   Length:589005     
 1st Qu.:1.014e+09   Class :character  
 Median :1.019e+09   Mode  :character  
 Mean   :1.017e+09                     
 3rd Qu.:1.021e+09                     
 Max.   :2.024e+09                     
 NA's   :305                           

3.3.2 311 Complaints dataset

Code
head(sr311_raw)
  Unique.Key           Created.Date Closed.Date Agency
1   66934900 11/23/2025 11:58:19 PM                HPD
2   66932786 11/23/2025 11:55:32 PM                HPD
3   66931601 11/23/2025 11:52:04 PM                HPD
4   66929338 11/23/2025 11:51:49 PM                HPD
5   66933739 11/23/2025 11:48:51 PM                HPD
6   66929209 11/23/2025 11:42:51 PM                HPD
                                         Agency.Name Complaint.Type
1 Department of Housing Preservation and Development HEAT/HOT WATER
2 Department of Housing Preservation and Development       PLUMBING
3 Department of Housing Preservation and Development HEAT/HOT WATER
4 Department of Housing Preservation and Development HEAT/HOT WATER
5 Department of Housing Preservation and Development HEAT/HOT WATER
6 Department of Housing Preservation and Development HEAT/HOT WATER
       Descriptor        Location.Type Incident.Zip     Incident.Address
1 ENTIRE BUILDING RESIDENTIAL BUILDING        10028 425 EAST   84 STREET
2  BATHTUB/SHOWER RESIDENTIAL BUILDING        10040 121 FT GEORGE AVENUE
3 ENTIRE BUILDING RESIDENTIAL BUILDING        10040 121 FT GEORGE AVENUE
4 ENTIRE BUILDING RESIDENTIAL BUILDING        10028 425 EAST   84 STREET
5 ENTIRE BUILDING RESIDENTIAL BUILDING        10025 242 WEST  109 STREET
6  APARTMENT ONLY RESIDENTIAL BUILDING        10009          26 1 AVENUE
       Street.Name Cross.Street.1 Cross.Street.2 Intersection.Street.1
1 EAST   84 STREET                                                    
2 FT GEORGE AVENUE                                                    
3 FT GEORGE AVENUE                                                    
4 EAST   84 STREET                                                    
5 WEST  109 STREET                                                    
6         1 AVENUE                                                    
  Intersection.Street.2 Address.Type     City Landmark Facility.Type Status
1                            ADDRESS NEW YORK                          Open
2                            ADDRESS NEW YORK                          Open
3                            ADDRESS NEW YORK                          Open
4                            ADDRESS NEW YORK                          Open
5                            ADDRESS NEW YORK                          Open
6                            ADDRESS NEW YORK                          Open
  Due.Date
1       NA
2       NA
3       NA
4       NA
5       NA
6       NA
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Resolution.Description
1 This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
2                                                                                                                                                         The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
3 This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
4                                                                                                                                                         The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
5 This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
6                                                                                                                                                         The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
  Resolution.Action.Updated.Date Community.Board        BBL   Borough
1         11/23/2025 12:00:00 AM    08 MANHATTAN 1015640012 MANHATTAN
2         11/23/2025 12:00:00 AM    12 MANHATTAN 1021490265 MANHATTAN
3         11/23/2025 12:00:00 AM    12 MANHATTAN 1021490265 MANHATTAN
4         11/23/2025 12:00:00 AM    08 MANHATTAN 1015640012 MANHATTAN
5         11/23/2025 12:00:00 AM    07 MANHATTAN 1018800053 MANHATTAN
6         11/23/2025 12:00:00 AM    03 MANHATTAN 1004290009 MANHATTAN
  X.Coordinate..State.Plane. Y.Coordinate..State.Plane. Open.Data.Channel.Type
1                    998,207                    221,730                 ONLINE
2                  1,004,446                    251,683                  PHONE
3                  1,004,446                    251,683                  PHONE
4                    998,207                    221,730                 ONLINE
5                    993,732                    231,824                  PHONE
6                    987,565                    202,976                  PHONE
  Park.Facility.Name Park.Borough Vehicle.Type Taxi.Company.Borough
1        Unspecified    MANHATTAN           NA                   NA
2        Unspecified    MANHATTAN           NA                   NA
3        Unspecified    MANHATTAN           NA                   NA
4        Unspecified    MANHATTAN           NA                   NA
5        Unspecified    MANHATTAN           NA                   NA
6        Unspecified    MANHATTAN           NA                   NA
  Taxi.Pick.Up.Location Bridge.Highway.Name Bridge.Highway.Direction Road.Ramp
1                    NA                  NA                       NA        NA
2                    NA                  NA                       NA        NA
3                    NA                  NA                       NA        NA
4                    NA                  NA                       NA        NA
5                    NA                  NA                       NA        NA
6                    NA                  NA                       NA        NA
  Bridge.Highway.Segment Latitude Longitude
1                     NA 40.77526 -73.94961
2                     NA 40.85746 -73.92699
3                     NA 40.85746 -73.92699
4                     NA 40.77526 -73.94961
5                     NA 40.80297 -73.96575
6                     NA 40.72380 -73.98804
                                 Location CreatedDate
1 (40.77526292188198, -73.94960785720805)  2025-11-23
2 (40.85746342279737, -73.92699164532567)  2025-11-23
3 (40.85746342279737, -73.92699164532567)  2025-11-23
4 (40.77526292188198, -73.94960785720805)  2025-11-23
5 (40.80297417373674, -73.96575071974307)  2025-11-23
6 (40.72379831989205, -73.98804034230925)  2025-11-23
Code
colnames(sr311_raw)
 [1] "Unique.Key"                     "Created.Date"                  
 [3] "Closed.Date"                    "Agency"                        
 [5] "Agency.Name"                    "Complaint.Type"                
 [7] "Descriptor"                     "Location.Type"                 
 [9] "Incident.Zip"                   "Incident.Address"              
[11] "Street.Name"                    "Cross.Street.1"                
[13] "Cross.Street.2"                 "Intersection.Street.1"         
[15] "Intersection.Street.2"          "Address.Type"                  
[17] "City"                           "Landmark"                      
[19] "Facility.Type"                  "Status"                        
[21] "Due.Date"                       "Resolution.Description"        
[23] "Resolution.Action.Updated.Date" "Community.Board"               
[25] "BBL"                            "Borough"                       
[27] "X.Coordinate..State.Plane."     "Y.Coordinate..State.Plane."    
[29] "Open.Data.Channel.Type"         "Park.Facility.Name"            
[31] "Park.Borough"                   "Vehicle.Type"                  
[33] "Taxi.Company.Borough"           "Taxi.Pick.Up.Location"         
[35] "Bridge.Highway.Name"            "Bridge.Highway.Direction"      
[37] "Road.Ramp"                      "Bridge.Highway.Segment"        
[39] "Latitude"                       "Longitude"                     
[41] "Location"                       "CreatedDate"                   
Code
summary(sr311_raw)
   Unique.Key       Created.Date       Closed.Date           Agency         
 Min.   :52939155   Length:425248      Length:425248      Length:425248     
 1st Qu.:56470870   Class :character   Class :character   Class :character  
 Median :59938206   Mode  :character   Mode  :character   Mode  :character  
 Mean   :59892593                                                           
 3rd Qu.:63464249                                                           
 Max.   :66935006                                                           
                                                                            
 Agency.Name        Complaint.Type      Descriptor        Location.Type     
 Length:425248      Length:425248      Length:425248      Length:425248     
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
  Incident.Zip   Incident.Address   Street.Name        Cross.Street.1    
 Min.   :10000   Length:425248      Length:425248      Length:425248     
 1st Qu.:10024   Class :character   Class :character   Class :character  
 Median :10030   Mode  :character   Mode  :character   Mode  :character  
 Mean   :10032                                                           
 3rd Qu.:10034                                                           
 Max.   :12345                                                           
 NA's   :1                                                               
 Cross.Street.2     Intersection.Street.1 Intersection.Street.2
 Length:425248      Length:425248         Length:425248        
 Class :character   Class :character      Class :character     
 Mode  :character   Mode  :character      Mode  :character     
                                                               
                                                               
                                                               
                                                               
 Address.Type           City             Landmark         Facility.Type     
 Length:425248      Length:425248      Length:425248      Length:425248     
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
    Status          Due.Date       Resolution.Description
 Length:425248      Mode:logical   Length:425248         
 Class :character   NA's:425248    Class :character      
 Mode  :character                  Mode  :character      
                                                         
                                                         
                                                         
                                                         
 Resolution.Action.Updated.Date Community.Board         BBL           
 Length:425248                  Length:425248      Min.   :0.000e+00  
 Class :character               Class :character   1st Qu.:1.012e+09  
 Mode  :character               Mode  :character   Median :1.019e+09  
                                                   Mean   :1.016e+09  
                                                   3rd Qu.:1.021e+09  
                                                   Max.   :1.240e+09  
                                                   NA's   :2302       
   Borough          X.Coordinate..State.Plane. Y.Coordinate..State.Plane.
 Length:425248      Length:425248              Length:425248             
 Class :character   Class :character           Class :character          
 Mode  :character   Mode  :character           Mode  :character          
                                                                         
                                                                         
                                                                         
                                                                         
 Open.Data.Channel.Type Park.Facility.Name Park.Borough       Vehicle.Type  
 Length:425248          Length:425248      Length:425248      Mode:logical  
 Class :character       Class :character   Class :character   NA's:425248   
 Mode  :character       Mode  :character   Mode  :character                 
                                                                            
                                                                            
                                                                            
                                                                            
 Taxi.Company.Borough Taxi.Pick.Up.Location Bridge.Highway.Name
 Mode:logical         Mode:logical          Mode:logical       
 NA's:425248          NA's:425248           NA's:425248        
                                                               
                                                               
                                                               
                                                               
                                                               
 Bridge.Highway.Direction Road.Ramp      Bridge.Highway.Segment    Latitude    
 Mode:logical             Mode:logical   Mode:logical           Min.   :40.69  
 NA's:425248              NA's:425248    NA's:425248            1st Qu.:40.77  
                                                                Median :40.81  
                                                                Mean   :40.80  
                                                                3rd Qu.:40.83  
                                                                Max.   :40.88  
                                                                               
   Longitude        Location         CreatedDate       
 Min.   :-74.02   Length:425248      Length:425248     
 1st Qu.:-73.97   Class :character   Class :character  
 Median :-73.95   Mode  :character   Mode  :character  
 Mean   :-73.96                                        
 3rd Qu.:-73.94                                        
 Max.   :-73.91                                        
                                                       

3.4 Checking Key distributions

Code
# ========== KEY DISTRIBUTIONS ==========

# Use tidyverse counting (no data.table .N)

housing_copy <- housing_raw
sr311_copy <- sr311_raw

# Housing Violations - Top 10 Current Status (showing top 10 since there are many unique values)

housing_copy |>
  filter(!is.na(CurrentStatus)) |>
    count(CurrentStatus, sort = TRUE) |>
      slice_head(n = 10) |>
        print()
                              CurrentStatus      n
1                       VIOLATION DISMISSED 197906
2                          VIOLATION CLOSED 180829
3                              NOV SENT OUT 112090
4  FIRST NO ACCESS TO RE- INSPECT VIOLATION  18629
5                         NOT COMPLIED WITH  17976
6                         INFO NOV SENT OUT  12927
7         NOTICE OF ISSUANCE SENT TO TENANT  11160
8             VIOLATION WILL BE REINSPECTED  10263
9                              CIV14 MAILED   7244
10                     DEFECT LETTER ISSUED   5934
Code
# 311 Complaints - Top 10 Complaint Types (showing top 10)

sr311_copy |>
  filter(!is.na(Complaint.Type)) |>
    count(Complaint.Type, sort = TRUE) |>
      slice_head(n = 10) |>
        print()
                  Complaint.Type      n
1                 HEAT/HOT WATER 229222
2                       PLUMBING  50637
3                  PAINT/PLASTER  44624
4                     WATER LEAK  30190
5  General Construction/Plumbing  25207
6        Maintenance or Facility  20202
7                       Elevator  16588
8                       Plumbing   2197
9                       ELEVATOR   1302
10          Non-Residential Heat   1094
Code
# 311 Complaints - Status distribution

sr311_copy |>
  filter(!is.na(Status)) |>
    count(Status, sort = TRUE) |>
      print()
       Status      n
1      Closed 419047
2        Open   5152
3 In Progress    922
4    Assigned    124
5 Unspecified      3
Code
# 311 Complaints - Top Agencies

sr311_copy |>
  filter(!is.na(Agency)) |>
    count(Agency, sort = TRUE) |>
      slice_head(n = 10) |>
        print()
  Agency      n
1    HPD 358603
2    DOB  43687
3    DPR  20202
4  DOHMH   1579
5    DOE   1064
6    DEP     99
7    DOT     14

3.5 Formatting dates and making sure loaded data has only Manhattan Borough entries

Here, we created a robust set of parse orders for mixed formats in both our datasets. We created cleaned date columns while keep original columns untouched. Also, we added new columns with month-year (Month name + Year) extracted which will be used for alluvial stacks later on

Code
# VALID DATE ORDERS

date_orders <- c("mdY HMS", "mdY HM", "mdY", "Ymd HMS", "Ymd")

housing <- housing_raw |>
  mutate(
    borough_std = toupper(Borough),
    # PARSE master versions — keep duplicates untouched
    inspectiondate_clean = parse_date_time(InspectionDate, orders = date_orders),
    approveddate_clean   = parse_date_time(ApprovedDate, orders = date_orders),
    certifieddate_clean  = parse_date_time(CertifiedDate, orders = date_orders),
    novissued_clean      = parse_date_time(NOVIssuedDate, orders = date_orders),
    statusdate_clean     = parse_date_time(CurrentStatusDate, orders = date_orders),
    originalcertifybydate_clean = parse_date_time(OriginalCertifyByDate, orders = date_orders),
    originalcorrectbydate_clean = parse_date_time(OriginalCorrectByDate, orders = date_orders),
    newcertifybydate_clean = parse_date_time(NewCertifyByDate, orders = date_orders),
    newcorrectbydate_cean = parse_date_time(NewCorrectByDate, orders = date_orders),
  ) |>
  filter(borough_std == "MANHATTAN")

sr311 <- sr311_raw |>
  mutate(
    borough_std = toupper(Borough),
    created_clean = parse_date_time(Created.Date, orders = date_orders),
    closed_clean  = parse_date_time(Closed.Date, orders = date_orders)
  ) |>
  filter(borough_std == "MANHATTAN")


# adding month-year (Month name + Year) columns used for alluvial stacks

housing <- housing |>
  mutate(
    inspection_month = ifelse(!is.na(inspectiondate_clean),
                              format(floor_date(inspectiondate_clean, "month"), "%B %Y"),
                              NA_character_),
    approved_month = ifelse(!is.na(approveddate_clean), 
                            format(floor_date(approveddate_clean,"month"), "%B %Y"),
                            NA_character_),
    nov_month = ifelse(!is.na(novissued_clean),
                       format(floor_date(novissued_clean, "month"), "%B %Y"),
                       NA_character_),
    status_month = ifelse(!is.na(statusdate_clean),
                          format(floor_date(statusdate_clean, "month"), "%B %Y"),
                          NA_character_)
  )

sr311 <- sr311 |>
  mutate(
    created_month = ifelse(!is.na(created_clean),
                           format(floor_date(created_clean, "month"), "%B %Y"),
                           NA_character_),
    closed_month = ifelse(!is.na(closed_clean),
                           format(floor_date(closed_clean, "month"), "%B %Y"),
                           NA_character_)
)

cat("Final Housing rows:", nrow(housing), "\n")
Final Housing rows: 589005 
Code
cat("Final 311 rows:", nrow(sr311), "\n")
Final 311 rows: 425248 

The date columns have been parsed properly and as we can see the both raw data we downloaded, only had data from the Manhattan borough since before and after filtering the number of rows remain same for both datastes i.e. 589005 rows in Housing Violations dataset and 425248 rows in 311 requests dataset.

3.5.1 Checking 10 rows of data with new date columns included

Code
head(housing,10)
   ViolationID BuildingID RegistrationID BoroID   Borough HouseNumber
1     15004789     805012         100420      1 MANHATTAN     144REAR
2     14817075       9458         104092      1 MANHATTAN           1
3     14831546      40441         118126      1 MANHATTAN         158
4     15313781      18016         120064      1 MANHATTAN         314
5     14838558       8157         118733      1 MANHATTAN        3885
6     14914309      43369         111345      1 MANHATTAN         502
7     15349038      38326         107467      1 MANHATTAN         143
8     15097146      42830         105422      1 MANHATTAN         542
9     15097147      42830         105422      1 MANHATTAN         542
10    15099194       3423         125427      1 MANHATTAN         201
   LowHouseNumber HighHouseNumber         StreetName StreetCode Postcode
1         144rear        146 REAR      LUDLOW STREET      25090    10002
2               1               7 CHRISTOPHER STREET      15210    10014
3             158             158    WEST 132 STREET      36410    10027
4             314             314     EAST 89 STREET      18770    10128
5            3885            3897           BROADWAY      13610    10032
6             502             502    WEST 180 STREET      37350    10033
7             143             145    WEST 116 STREET      36090    10026
8             542             544    WEST 159 STREET      36950    10032
9             542             544    WEST 159 STREET      36950    10032
10            201             201    WEST 120 STREET      10610    10027
   Apartment Story Block Lot Class InspectionDate ApprovedDate
1               NA   411  45     A     2022-03-08   03/08/2022
2         9C     9   610  60     A     2022-02-10   02/10/2022
3         3N     3  1916  55     B     2022-02-11   02/11/2022
4                0  1551  43     C     2022-08-16   08/17/2022
5        20B     2  2137  80     B     2022-02-14   02/15/2022
6          7     3  2152  45     B     2022-02-23   02/24/2022
7         2A     2  1901   9     B     2022-08-30   08/31/2022
8         5C     5  2117  14     B     2022-04-09   04/09/2022
9         5C     5  2117  14     B     2022-04-09   04/09/2022
10        1A     1  1926  29     B     2022-04-12   04/12/2022
   OriginalCertifyByDate OriginalCorrectByDate NewCertifyByDate
1             06/25/2022            06/11/2022                 
2             05/31/2022            05/17/2022                 
3             04/04/2022            03/21/2022                 
4             09/06/2022            09/01/2022                 
5             04/06/2022            03/23/2022                 
6             04/15/2022            04/01/2022                 
7             10/20/2022            10/06/2022                 
8             05/30/2022            05/16/2022                 
9             05/30/2022            05/16/2022                 
10            06/01/2022            05/18/2022                 
   NewCorrectByDate CertifiedDate OrderNumber   NOVID
1                                        1507 7793231
2                                         554 7636476
3                                         579 7638268
4                      12/05/2022         530 8007840
5                      03/18/2022         579 7651705
6                      03/18/2022        1503 7724697
7                      09/30/2022         702 8023066
8                      05/30/2022        1503 7870389
9                      05/30/2022         702 7870389
10                     05/30/2022         508 7873344
                                                                                                                                                                                                                           NOVDescription
1                                   (A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS.
2                                                             § 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH
3                                                                                  § 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST
4                                                                                                                    § 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD
5                                                                               § 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST
6                                                               § 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH
7                                                                                                          § 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING LOCATED AT APT 2A, 2nd STORY, 1st APARTMENT FROM WEST AT NORTH
8                                                              § 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH
9                                                                                  § 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH
10 § 27-2005 ADM CODE REPAIR THE BROKEN OR DEFECTIVE PLASTERED SURFACES AND PAINT IN A UNIFORM COLOR EAST WALL APPROX 4SQ FT AND NORTH WALL APPROX 4 SQ FT IN THE BATHROOM LOCATED AT APT 1A, 1st STORY, 2nd APARTMENT FROM SOUTH AT WEST
   NOVIssuedDate CurrentStatusID       CurrentStatus CurrentStatusDate  NovType
1     03/08/2022               9 VIOLATION DISMISSED        03/23/2022 Original
2     02/11/2022              19    VIOLATION CLOSED        03/06/2024 Original
3     02/14/2022               2        NOV SENT OUT        02/14/2022 Original
4     08/18/2022              19    VIOLATION CLOSED        06/29/2023 Original
5     02/16/2022               9 VIOLATION DISMISSED        05/31/2022 Original
6     02/25/2022               9 VIOLATION DISMISSED        05/31/2022 Original
7     09/01/2022               9 VIOLATION DISMISSED        12/12/2022 Original
8     04/11/2022              19    VIOLATION CLOSED        07/03/2022 Original
9     04/11/2022              19    VIOLATION CLOSED        07/03/2022 Original
10    04/13/2022               9 VIOLATION DISMISSED        08/10/2022 Original
   ViolationStatus RentImpairing Latitude Longitude CommunityBoard
1            Close             N       NA        NA             NA
2            Close             N 40.73407 -73.99976              2
3             Open             N 40.81285 -73.94384             10
4            Close             N 40.77955 -73.94975              8
5            Close             N 40.83741 -73.94244             12
6            Close             N 40.84754 -73.93169             12
7            Close             N 40.80257 -73.95095             10
8            Close             N 40.83461 -73.94233             12
9            Close             N 40.83461 -73.94233             12
10           Close             N 40.80592 -73.95095             10
   CouncilDistrict CensusTract     BIN        BBL                        NTA
1               NA          NA      NA         NA                           
2                3          71 1010692 1006100060               West Village
3                9         226 1058117 1019160055             Harlem (North)
4                5       14602 1050093 1015510043  Upper East Side-Yorkville
5                7         245 1063363 1021370080 Washington Heights (South)
6               10         261 1063601 1021520045 Washington Heights (South)
7                9         218 1057394 1019010009             Harlem (South)
8                7         245 1062750 1021170014 Washington Heights (South)
9                7         245 1062750 1021170014 Washington Heights (South)
10               9         220 1079501 1019260029             Harlem (South)
   borough_std inspectiondate_clean approveddate_clean certifieddate_clean
1    MANHATTAN           2022-03-08         2022-03-08                <NA>
2    MANHATTAN           2022-02-10         2022-02-10                <NA>
3    MANHATTAN           2022-02-11         2022-02-11                <NA>
4    MANHATTAN           2022-08-16         2022-08-17          2022-12-05
5    MANHATTAN           2022-02-14         2022-02-15          2022-03-18
6    MANHATTAN           2022-02-23         2022-02-24          2022-03-18
7    MANHATTAN           2022-08-30         2022-08-31          2022-09-30
8    MANHATTAN           2022-04-09         2022-04-09          2022-05-30
9    MANHATTAN           2022-04-09         2022-04-09          2022-05-30
10   MANHATTAN           2022-04-12         2022-04-12          2022-05-30
   novissued_clean statusdate_clean originalcertifybydate_clean
1       2022-03-08       2022-03-23                  2022-06-25
2       2022-02-11       2024-03-06                  2022-05-31
3       2022-02-14       2022-02-14                  2022-04-04
4       2022-08-18       2023-06-29                  2022-09-06
5       2022-02-16       2022-05-31                  2022-04-06
6       2022-02-25       2022-05-31                  2022-04-15
7       2022-09-01       2022-12-12                  2022-10-20
8       2022-04-11       2022-07-03                  2022-05-30
9       2022-04-11       2022-07-03                  2022-05-30
10      2022-04-13       2022-08-10                  2022-06-01
   originalcorrectbydate_clean newcertifybydate_clean newcorrectbydate_cean
1                   2022-06-11                   <NA>                  <NA>
2                   2022-05-17                   <NA>                  <NA>
3                   2022-03-21                   <NA>                  <NA>
4                   2022-09-01                   <NA>                  <NA>
5                   2022-03-23                   <NA>                  <NA>
6                   2022-04-01                   <NA>                  <NA>
7                   2022-10-06                   <NA>                  <NA>
8                   2022-05-16                   <NA>                  <NA>
9                   2022-05-16                   <NA>                  <NA>
10                  2022-05-18                   <NA>                  <NA>
   inspection_month approved_month      nov_month  status_month
1        March 2022     March 2022     March 2022    March 2022
2     February 2022  February 2022  February 2022    March 2024
3     February 2022  February 2022  February 2022 February 2022
4       August 2022    August 2022    August 2022     June 2023
5     February 2022  February 2022  February 2022      May 2022
6     February 2022  February 2022  February 2022      May 2022
7       August 2022    August 2022 September 2022 December 2022
8        April 2022     April 2022     April 2022     July 2022
9        April 2022     April 2022     April 2022     July 2022
10       April 2022     April 2022     April 2022   August 2022
Code
head(sr311,10)
   Unique.Key           Created.Date Closed.Date Agency
1    66934900 11/23/2025 11:58:19 PM                HPD
2    66932786 11/23/2025 11:55:32 PM                HPD
3    66931601 11/23/2025 11:52:04 PM                HPD
4    66929338 11/23/2025 11:51:49 PM                HPD
5    66933739 11/23/2025 11:48:51 PM                HPD
6    66929209 11/23/2025 11:42:51 PM                HPD
7    66933740 11/23/2025 11:39:54 PM                HPD
8    66931510 11/23/2025 11:39:46 PM                HPD
9    66929270 11/23/2025 11:36:49 PM                HPD
10   66931511 11/23/2025 11:34:10 PM                HPD
                                          Agency.Name Complaint.Type
1  Department of Housing Preservation and Development HEAT/HOT WATER
2  Department of Housing Preservation and Development       PLUMBING
3  Department of Housing Preservation and Development HEAT/HOT WATER
4  Department of Housing Preservation and Development HEAT/HOT WATER
5  Department of Housing Preservation and Development HEAT/HOT WATER
6  Department of Housing Preservation and Development HEAT/HOT WATER
7  Department of Housing Preservation and Development HEAT/HOT WATER
8  Department of Housing Preservation and Development HEAT/HOT WATER
9  Department of Housing Preservation and Development HEAT/HOT WATER
10 Department of Housing Preservation and Development HEAT/HOT WATER
        Descriptor        Location.Type Incident.Zip      Incident.Address
1  ENTIRE BUILDING RESIDENTIAL BUILDING        10028  425 EAST   84 STREET
2   BATHTUB/SHOWER RESIDENTIAL BUILDING        10040  121 FT GEORGE AVENUE
3  ENTIRE BUILDING RESIDENTIAL BUILDING        10040  121 FT GEORGE AVENUE
4  ENTIRE BUILDING RESIDENTIAL BUILDING        10028  425 EAST   84 STREET
5  ENTIRE BUILDING RESIDENTIAL BUILDING        10025  242 WEST  109 STREET
6   APARTMENT ONLY RESIDENTIAL BUILDING        10009           26 1 AVENUE
7  ENTIRE BUILDING RESIDENTIAL BUILDING        10034  106 VERMILYEA AVENUE
8  ENTIRE BUILDING RESIDENTIAL BUILDING        10024   20 WEST   86 STREET
9  ENTIRE BUILDING RESIDENTIAL BUILDING        10031  547 WEST  147 STREET
10 ENTIRE BUILDING RESIDENTIAL BUILDING        10032 2166 AMSTERDAM AVENUE
        Street.Name Cross.Street.1 Cross.Street.2 Intersection.Street.1
1  EAST   84 STREET                                                    
2  FT GEORGE AVENUE                                                    
3  FT GEORGE AVENUE                                                    
4  EAST   84 STREET                                                    
5  WEST  109 STREET                                                    
6          1 AVENUE                                                    
7  VERMILYEA AVENUE                                                    
8  WEST   86 STREET                                                    
9  WEST  147 STREET                                                    
10 AMSTERDAM AVENUE                                                    
   Intersection.Street.2 Address.Type     City Landmark Facility.Type Status
1                             ADDRESS NEW YORK                          Open
2                             ADDRESS NEW YORK                          Open
3                             ADDRESS NEW YORK                          Open
4                             ADDRESS NEW YORK                          Open
5                             ADDRESS NEW YORK                          Open
6                             ADDRESS NEW YORK                          Open
7                             ADDRESS NEW YORK                          Open
8                             ADDRESS NEW YORK                          Open
9                             ADDRESS NEW YORK                          Open
10                            ADDRESS NEW YORK                          Open
   Due.Date
1        NA
2        NA
3        NA
4        NA
5        NA
6        NA
7        NA
8        NA
9        NA
10       NA
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Resolution.Description
1  This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
2                                                                                                                                                          The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
3  This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
4                                                                                                                                                          The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
5  This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
6                                                                                                                                                          The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
7                                                                                                                                                          The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
8  This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
9  This complaint is a duplicate of a building-wide condition already reported by another tenant.  The original complaint is still open, and HPD may only need to confirm that the condition exists by inspecting one apartment.  If we cannot contact the tenant from the original complaint or get access to that apartment, HPD may attempt to contact the person who filed this complaint to verify the correction of the condition or may conduct an inspection of your unit. You can check HPDONLINE to see if a
10                                                                                                                                                         The following complaint conditions are still open. HPD has already attempted to notify the property owner that the condition exists; the tenant should provide access for the owner to make the repair.  HPD may attempt to contact the tenant by phone to verify the correction of the condition or an HPD Inspector may attempt to conduct an inspection.
   Resolution.Action.Updated.Date Community.Board        BBL   Borough
1          11/23/2025 12:00:00 AM    08 MANHATTAN 1015640012 MANHATTAN
2          11/23/2025 12:00:00 AM    12 MANHATTAN 1021490265 MANHATTAN
3          11/23/2025 12:00:00 AM    12 MANHATTAN 1021490265 MANHATTAN
4          11/23/2025 12:00:00 AM    08 MANHATTAN 1015640012 MANHATTAN
5          11/23/2025 12:00:00 AM    07 MANHATTAN 1018800053 MANHATTAN
6          11/23/2025 12:00:00 AM    03 MANHATTAN 1004290009 MANHATTAN
7          11/23/2025 12:00:00 AM    12 MANHATTAN 1022350035 MANHATTAN
8          11/23/2025 12:00:00 AM    07 MANHATTAN 1011990045 MANHATTAN
9          11/23/2025 12:00:00 AM    09 MANHATTAN 1020790007 MANHATTAN
10         11/23/2025 12:00:00 AM    12 MANHATTAN 1021230100 MANHATTAN
   X.Coordinate..State.Plane. Y.Coordinate..State.Plane. Open.Data.Channel.Type
1                     998,207                    221,730                 ONLINE
2                   1,004,446                    251,683                  PHONE
3                   1,004,446                    251,683                  PHONE
4                     998,207                    221,730                 ONLINE
5                     993,732                    231,824                  PHONE
6                     987,565                    202,976                  PHONE
7                   1,005,840                    254,895                  PHONE
8                     992,387                    225,574                 MOBILE
9                     998,651                    240,613                  PHONE
10                  1,001,609                    245,101                 ONLINE
   Park.Facility.Name Park.Borough Vehicle.Type Taxi.Company.Borough
1         Unspecified    MANHATTAN           NA                   NA
2         Unspecified    MANHATTAN           NA                   NA
3         Unspecified    MANHATTAN           NA                   NA
4         Unspecified    MANHATTAN           NA                   NA
5         Unspecified    MANHATTAN           NA                   NA
6         Unspecified    MANHATTAN           NA                   NA
7         Unspecified    MANHATTAN           NA                   NA
8         Unspecified    MANHATTAN           NA                   NA
9         Unspecified    MANHATTAN           NA                   NA
10        Unspecified    MANHATTAN           NA                   NA
   Taxi.Pick.Up.Location Bridge.Highway.Name Bridge.Highway.Direction Road.Ramp
1                     NA                  NA                       NA        NA
2                     NA                  NA                       NA        NA
3                     NA                  NA                       NA        NA
4                     NA                  NA                       NA        NA
5                     NA                  NA                       NA        NA
6                     NA                  NA                       NA        NA
7                     NA                  NA                       NA        NA
8                     NA                  NA                       NA        NA
9                     NA                  NA                       NA        NA
10                    NA                  NA                       NA        NA
   Bridge.Highway.Segment Latitude Longitude
1                      NA 40.77526 -73.94961
2                      NA 40.85746 -73.92699
3                      NA 40.85746 -73.92699
4                      NA 40.77526 -73.94961
5                      NA 40.80297 -73.96575
6                      NA 40.72380 -73.98804
7                      NA 40.86628 -73.92194
8                      NA 40.78582 -73.97062
9                      NA 40.82709 -73.94796
10                     NA 40.83940 -73.93726
                                   Location CreatedDate borough_std
1   (40.77526292188198, -73.94960785720805)  2025-11-23   MANHATTAN
2   (40.85746342279737, -73.92699164532567)  2025-11-23   MANHATTAN
3   (40.85746342279737, -73.92699164532567)  2025-11-23   MANHATTAN
4   (40.77526292188198, -73.94960785720805)  2025-11-23   MANHATTAN
5   (40.80297417373674, -73.96575071974307)  2025-11-23   MANHATTAN
6   (40.72379831989205, -73.98804034230925)  2025-11-23   MANHATTAN
7   (40.86627609160888, -73.92194200381317)  2025-11-23   MANHATTAN
8   (40.78582095720245, -73.97061647397211)  2025-11-23   MANHATTAN
9     (40.827090867999, -73.94796428323792)  2025-11-23   MANHATTAN
10 (40.839403814099306, -73.93726441124402)  2025-11-23   MANHATTAN
         created_clean closed_clean created_month closed_month
1  2025-11-23 11:58:19         <NA> November 2025         <NA>
2  2025-11-23 11:55:32         <NA> November 2025         <NA>
3  2025-11-23 11:52:04         <NA> November 2025         <NA>
4  2025-11-23 11:51:49         <NA> November 2025         <NA>
5  2025-11-23 11:48:51         <NA> November 2025         <NA>
6  2025-11-23 11:42:51         <NA> November 2025         <NA>
7  2025-11-23 11:39:54         <NA> November 2025         <NA>
8  2025-11-23 11:39:46         <NA> November 2025         <NA>
9  2025-11-23 11:36:49         <NA> November 2025         <NA>
10 2025-11-23 11:34:10         <NA> November 2025         <NA>

3.6 Housing violation decription column data cleaning

Code
## Housing violation description column data cleaning

# First, let's inspect NOVDescription
head(housing$NOVDescription, 10)
 [1] "(A) § HMC:FILE ANNUAL BEDBUG REPORT IN ACCORDANCE WITH HPD RULE AS DESCRIBED ON THE BACK OF THIS NOTICE OF VIOLATION OR AS DESCRIBED ON HPDâ\u0080\u0099S WEBSITE, WWW.NYC.GOV\\HPD, SEARCH BED BUGS."                                 
 [2] "§ 27-2005 ADM CODE PAINT METAL IN ACCORDANCE WITH DEPT. REGULATION RADIATOR AT THE SOUTH WALL IN THE KITCHEN LOCATED AT APT 9C, 9th STORY, 3rd APARTMENT FROM EAST AT SOUTH"                                                           
 [3] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT SINK IN THE KITCHENETTE LOCATED AT B-ROOM 3N, 3rd STORY, 1st B-ROOM FROM NORTH AT EAST"                                                                                
 [4] "§ 27-2005, 27-2007, 27-2041.1 HMC: REPLACE OR REPAIR THE SELF-CLOSING DOORS THAT IS MISSING OR DEFECTIVE AT BULKHEAD"                                                                                                                  
 [5] "§ 27-2026 ADM CODE REPAIR THE LEAKY AND/OR DEFECTIVE FAUCETS AT WASHBASIN IN THE BATHROOM LOCATED AT APT 20B, 2nd STORY, 1st APARTMENT FROM NORTH AT EAST"                                                                             
 [6] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 7, 3rd STORY, 1st APARTMENT FROM EAST AT SOUTH"                                                             
 [7] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING LOCATED AT APT 2A, 2nd STORY, 1st APARTMENT FROM WEST AT NORTH"                                                                                                        
 [8] "§ 27-2046.1 HMC: REPAIR OR REPLACE THE CARBON MONOXIDE DETECTING DEVICE(S). MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"                                                            
 [9] "§ 27-2045 ADM CODE REPAIR OR REPLACE THE SMOKE DETECTOR MISSING IN THE ENTIRE APARTMENT LOCATED AT APT 5C, 5th STORY, 1st APARTMENT FROM WEST AT NORTH"                                                                                
[10] "§ 27-2005 ADM CODE REPAIR THE BROKEN OR DEFECTIVE PLASTERED SURFACES AND PAINT IN A UNIFORM COLOR EAST WALL APPROX 4SQ FT AND NORTH WALL APPROX 4 SQ FT IN THE BATHROOM LOCATED AT APT 1A, 1st STORY, 2nd APARTMENT FROM SOUTH AT WEST"
Code
# ========== REFINED CATEGORIZATION ==========

housing <- housing |>
  mutate(
    ViolationCategory = case_when(

      # 1 — Registration (Class I)
      Class == "I" |
        str_detect(NOVDescription, regex("registration|register", ignore_case = TRUE)) ~
        "REGISTRATION/ADMIN",

      # 2 — Smoke / CO
      str_detect(NOVDescription, regex("smoke detector|carbon monoxide|co detect", ignore_case = TRUE)) ~
        "SMOKE/CO DETECTOR",

      # 3 — Heat / Hot Water
      str_detect(NOVDescription, regex("heat|heating|hot water|radiator|boiler|steam|too cold", ignore_case = TRUE)) ~
        "HEAT/HOT WATER",

      # 4 — Plumbing
      str_detect(NOVDescription, regex("plumb|pipe|faucet|drain|sewer|bathroom|sink|toilet|water supply", ignore_case = TRUE)) ~
        "PLUMBING",

      # 5 — Water leak
      str_detect(NOVDescription, regex("leak|leaking|leakage|water drip", ignore_case = TRUE)) ~
        "WATER LEAK",

      # 6 — Paint / Plaster
      str_detect(NOVDescription, regex("paint|plaster|peel|wall|ceiling", ignore_case = TRUE)) ~
        "PAINT/PLASTER",

      # 7 — Door / Window / Lock
      str_detect(NOVDescription, regex("door|window|lock|self-closing|entrance", ignore_case = TRUE)) ~
        "DOOR/WINDOW/LOCK",

      # 8 — Pest / Sanitation
      str_detect(NOVDescription, regex("rodent|pest|roach|mice|rat|garbage|infest|sanitation", ignore_case = TRUE)) ~
        "PEST/SANITATION",

      # 9 — Floor / Ceiling
      str_detect(NOVDescription, regex("floor|tile|carpet|wood floor|ceramic", ignore_case = TRUE)) ~
        "FLOOR/CEILING",

      # 10 — Elevator
      str_detect(NOVDescription, regex("elevator|lift", ignore_case = TRUE)) ~
        "ELEVATOR",

      # 11 — Mold
      str_detect(NOVDescription, regex("mold|mildew|moisture", ignore_case = TRUE)) ~
        "MOLD",

      # 12 — Electrical
      str_detect(NOVDescription, regex("electric|wiring|outlet|light|circuit", ignore_case = TRUE))  ~ "ELECTRICAL",

      # 13 — Gas / Appliances
      str_detect(NOVDescription, regex("gas|appliance|stove", ignore_case = TRUE)) ~
        "GAS/APPLIANCES",

      # 14 — Ventilation
      str_detect(NOVDescription, regex("ventilat|airflow|exhaust fan", ignore_case = TRUE)) ~
        "VENTILATION",

      # 15— Fire Safety
      str_detect(NOVDescription, regex("fire|sprinkler|fire escape|extinguish", ignore_case = TRUE)) ~
        "FIRE SAFETY",

      # 16— Intercom / Bell
      str_detect(NOVDescription, regex("bell|buzzer|intercom", ignore_case = TRUE)) ~
        "BUILDING SYSTEMS",

      # 17 — Building Management
      str_detect(NOVDescription, regex("janitor|superintendent|building service|super", ignore_case = TRUE)) ~
        "BUILDING MANAGEMENT",

      # Default
      TRUE ~ "OTHER"
    )
  )

# Check distribution of newly created categories
housing |>
  count(ViolationCategory, sort = TRUE) |>
  print()
     ViolationCategory      n
1        PAINT/PLASTER 127336
2             PLUMBING  93154
3     DOOR/WINDOW/LOCK  91860
4                OTHER  53533
5      PEST/SANITATION  50753
6    SMOKE/CO DETECTOR  40227
7       HEAT/HOT WATER  39990
8   REGISTRATION/ADMIN  35133
9           WATER LEAK  23370
10       FLOOR/CEILING  19633
11         FIRE SAFETY   7121
12      GAS/APPLIANCES   4004
13          ELECTRICAL   1607
14    BUILDING SYSTEMS    679
15 BUILDING MANAGEMENT    376
16                MOLD    118
17            ELEVATOR     81
18         VENTILATION     30
Code
# ========== 311: CATEGORY MAPPING (parallel to housing) ==========

# Compare with 311 Complaint Types
sr311 |>
  count(Complaint.Type, sort = TRUE) |>
  print()
                  Complaint.Type      n
1                 HEAT/HOT WATER 229222
2                       PLUMBING  50637
3                  PAINT/PLASTER  44624
4                     WATER LEAK  30190
5  General Construction/Plumbing  25207
6        Maintenance or Facility  20202
7                       Elevator  16588
8                       Plumbing   2197
9                       ELEVATOR   1302
10          Non-Residential Heat   1094
11            School Maintenance   1064
12              OUTSIDE BUILDING    857
13                    Water Leak    556
14                Heat/Hot Water    487
15                 Paint/Plaster    404
16                          Mold    402
17  Building Drinking Water Tank     83
18             Sewer Maintenance     68
19             Water Maintenance     31
20            Building Condition     19
21             Unstable Building     14
Code
# Mapped categories
sr311 <- sr311 |>
  mutate(
    Category = case_when(
      # Heat / Hot Water
      str_detect(`Complaint.Type`, regex("HEAT|HOT WATER", ignore_case = TRUE)) ~ "HEAT/HOT WATER",
      
      # Plumbing
      str_detect(`Complaint.Type`, regex("PLUMBING", ignore_case = TRUE)) ~ "PLUMBING",
      
      # Water Leak
      str_detect(`Complaint.Type`, regex("WATER LEAK", ignore_case = TRUE)) ~ "WATER LEAK",
      
      # Elevator
      str_detect(`Complaint.Type`, regex("ELEVATOR", ignore_case = TRUE)) ~ "ELEVATOR",
      
      # Mold
      str_detect(`Complaint.Type`, regex("MOLD", ignore_case = TRUE)) ~ "MOLD",
      
      # PAINT / PLASTER — newly added
      str_detect(`Complaint.Type`, regex("PAINT|PLASTER", ignore_case = TRUE)) ~ "PAINT/PLASTER",
      
      # Pest / sanitation based on Complaint.Type AND Descriptor
      (
        str_detect(`Complaint.Type`, regex("MAINTENANCE|FACILITY", ignore_case = TRUE)) &
        str_detect(Descriptor, regex("rodent|rodents|mice|rats|mouse|rat|insect|pest", ignore_case = TRUE))
      ) ~ "PEST/SANITATION",
      
      # Fallback
      TRUE ~ "OTHER"
    )
  )

# New mapped categories
sr311 |>
  count(Category, sort = TRUE) |>
  print()
         Category      n
1  HEAT/HOT WATER 230803
2        PLUMBING  78041
3   PAINT/PLASTER  45028
4      WATER LEAK  30746
5        ELEVATOR  17890
6           OTHER  17784
7 PEST/SANITATION   4554
8            MOLD    402

3.7 Save cleaned CSV copies for reproducibility

Code
write.csv(housing, "datasets/housing_manhattan_3years_clean.csv", row.names = FALSE)
write.csv(sr311,   "datasets/sr311_manhattan_3years_clean.csv", row.names = FALSE)

3.8 CATEGORY COMPARISON: Housing vs 311

Code
comparison_categories <- c(
  "HEAT/HOT WATER", "PLUMBING", "PAINT/PLASTER",
  "WATER LEAK", "ELEVATOR", "MOLD", "PEST/SANITATION"
)

# Violations count
viol_comparison <- housing |>
  filter(ViolationCategory %in% comparison_categories) |>
    group_by(ViolationCategory) |>
      summarise(Count = n(), .groups = "drop") |>
        mutate(
          Category = ViolationCategory,
          Source = "Housing Violations"
        ) |>
          select(Category, Count, Source)

# 311 count
complaints_comparison <- sr311 |>
  filter(Category %in% comparison_categories) |>
    group_by(Category) |>
      summarise(Count = n(), .groups = "drop") |>
        mutate(Source = "311 Complaints")

# Combine both
combined <- bind_rows(viol_comparison, complaints_comparison)

# Plot
ggplot(combined, aes(x = reorder(Category, -Count), y = Count, fill = Source)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = scales::comma(Count)),
            position = position_dodge(width = 0.9),
            vjust = -0.5, size = 4) +
  scale_fill_manual(values = c(
    "Housing Violations" = "purple",
    "311 Complaints"     = "pink"
  )) +
  scale_y_continuous(labels = scales::comma,
                     expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Violations vs 311 Complaints by Category",
    subtitle = "Manhattan, 2022–2025 — Comparable Categories Only",
    x = "Category",
    y = "Count",
    fill = "Source"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )

3.9 Missing Data summary:

Code
# === HOUSING VIOLATIONS ===

# Calculate missing counts and percentages
missing_housing <- housing |>
  summarise(across(everything(), ~ sum(is.na(.)))) |>
    pivot_longer(everything(), names_to = "column", values_to = "missing_count") |>
      mutate(total = nrow(housing), missing_pct = round(100 * missing_count / total, 2)) |>
        arrange(desc(missing_pct))


# Top housing missing columns (sample)
print(head(missing_housing, 20))
# A tibble: 20 × 4
   column                      missing_count  total missing_pct
   <chr>                               <int>  <int>       <dbl>
 1 newcertifybydate_clean             585410 589005       99.4 
 2 newcorrectbydate_cean              585410 589005       99.4 
 3 certifieddate_clean                360194 589005       61.2 
 4 Story                               77094 589005       13.1 
 5 NOVID                               34593 589005        5.87
 6 novissued_clean                     34593 589005        5.87
 7 originalcertifybydate_clean         34593 589005        5.87
 8 originalcorrectbydate_clean         34593 589005        5.87
 9 nov_month                           34593 589005        5.87
10 BIN                                   305 589005        0.05
11 BBL                                   305 589005        0.05
12 Postcode                              147 589005        0.02
13 Latitude                               77 589005        0.01
14 Longitude                              77 589005        0.01
15 CommunityBoard                         77 589005        0.01
16 CouncilDistrict                        77 589005        0.01
17 CensusTract                            77 589005        0.01
18 ViolationID                             0 589005        0   
19 BuildingID                              0 589005        0   
20 RegistrationID                          0 589005        0   
Code
# Columns with missing values more than 0%
print(filter(missing_housing, missing_pct > 0))
# A tibble: 17 × 4
   column                      missing_count  total missing_pct
   <chr>                               <int>  <int>       <dbl>
 1 newcertifybydate_clean             585410 589005       99.4 
 2 newcorrectbydate_cean              585410 589005       99.4 
 3 certifieddate_clean                360194 589005       61.2 
 4 Story                               77094 589005       13.1 
 5 NOVID                               34593 589005        5.87
 6 novissued_clean                     34593 589005        5.87
 7 originalcertifybydate_clean         34593 589005        5.87
 8 originalcorrectbydate_clean         34593 589005        5.87
 9 nov_month                           34593 589005        5.87
10 BIN                                   305 589005        0.05
11 BBL                                   305 589005        0.05
12 Postcode                              147 589005        0.02
13 Latitude                               77 589005        0.01
14 Longitude                              77 589005        0.01
15 CommunityBoard                         77 589005        0.01
16 CouncilDistrict                        77 589005        0.01
17 CensusTract                            77 589005        0.01
Code
# Columns with missing values more than 50%
print(missing_housing[missing_housing$missing_pct > 50, ])
# A tibble: 3 × 4
  column                 missing_count  total missing_pct
  <chr>                          <int>  <int>       <dbl>
1 newcertifybydate_clean        585410 589005        99.4
2 newcorrectbydate_cean         585410 589005        99.4
3 certifieddate_clean           360194 589005        61.2
Code
# === 311 COMPLAINTS ===

# Calculate missing counts and percentages
missing_311 <- sr311 |>
  summarise(across(everything(), ~ sum(is.na(.)))) |>
    pivot_longer(everything(), names_to = "column", values_to = "missing_count") |>
      mutate(total = nrow(sr311), missing_pct = round(100 * missing_count / total, 2)) |>
        arrange(desc(missing_pct))


# Top 311 missing columns (sample)
print(head(missing_311, 20))
# A tibble: 20 × 4
   column                   missing_count  total missing_pct
   <chr>                            <int>  <int>       <dbl>
 1 Due.Date                        425248 425248      100   
 2 Vehicle.Type                    425248 425248      100   
 3 Taxi.Company.Borough            425248 425248      100   
 4 Taxi.Pick.Up.Location           425248 425248      100   
 5 Bridge.Highway.Name             425248 425248      100   
 6 Bridge.Highway.Direction        425248 425248      100   
 7 Road.Ramp                       425248 425248      100   
 8 Bridge.Highway.Segment          425248 425248      100   
 9 closed_clean                      5151 425248        1.21
10 closed_month                      5151 425248        1.21
11 BBL                               2302 425248        0.54
12 Unique.Key                           0 425248        0   
13 Created.Date                         0 425248        0   
14 Closed.Date                          0 425248        0   
15 Agency                               0 425248        0   
16 Agency.Name                          0 425248        0   
17 Complaint.Type                       0 425248        0   
18 Descriptor                           0 425248        0   
19 Location.Type                        0 425248        0   
20 Incident.Zip                         1 425248        0   
Code
# Columns with >0% missing
print(filter(missing_311, missing_pct > 0))
# A tibble: 11 × 4
   column                   missing_count  total missing_pct
   <chr>                            <int>  <int>       <dbl>
 1 Due.Date                        425248 425248      100   
 2 Vehicle.Type                    425248 425248      100   
 3 Taxi.Company.Borough            425248 425248      100   
 4 Taxi.Pick.Up.Location           425248 425248      100   
 5 Bridge.Highway.Name             425248 425248      100   
 6 Bridge.Highway.Direction        425248 425248      100   
 7 Road.Ramp                       425248 425248      100   
 8 Bridge.Highway.Segment          425248 425248      100   
 9 closed_clean                      5151 425248        1.21
10 closed_month                      5151 425248        1.21
11 BBL                               2302 425248        0.54
Code
# Columns with >50% missing
print(filter(missing_311, missing_pct > 50))
# A tibble: 8 × 4
  column                   missing_count  total missing_pct
  <chr>                            <int>  <int>       <dbl>
1 Due.Date                        425248 425248         100
2 Vehicle.Type                    425248 425248         100
3 Taxi.Company.Borough            425248 425248         100
4 Taxi.Pick.Up.Location           425248 425248         100
5 Bridge.Highway.Name             425248 425248         100
6 Bridge.Highway.Direction        425248 425248         100
7 Road.Ramp                       425248 425248         100
8 Bridge.Highway.Segment          425248 425248         100
Code
# Saving results for visualization
write.csv(missing_housing, "datasets/missing_values_violations.csv", row.names = FALSE)
write.csv(missing_311, "datasets/missing_values_311.csv", row.names = FALSE)

3.9.1 Missing Value Analysis

3.10 Missing Visualizations

Code
# --------- Visualizations: Missingness bar charts (tidy) ---------

# Load the saved summaries (if needed) or use objects directly

missing_viol_plot <- missing_housing |> filter(missing_pct > 0)
missing_311_plot  <- missing_311 |> filter(missing_pct > 0)

# ========== GRAPH 1: BAR CHART - HOUSING VIOLATIONS ==========

p1 <- ggplot(missing_viol_plot, aes(x = reorder(column, missing_pct), y = missing_pct)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
  title = "Missing Values in Housing Violations Dataset",
  subtitle = "Manhattan, 2022-2025",
  x = "Column Name",
  y = "Percentage Missing (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
  plot.title = element_text(face = "bold", size = 16),
  axis.text.y = element_text(size = 11)
  )

print(p1)

Code
# ========== GRAPH 2: BAR CHART - 311 COMPLAINTS ==========

p2 <- ggplot(missing_311_plot, aes(x = reorder(column, missing_pct), y = missing_pct)) +
  geom_bar(stat = "identity", fill = "coral") +
  coord_flip() +
  labs(
  title = "Missing Values in 311 Housing Complaints Dataset",
  subtitle = "Manhattan, 2022-2025",
  x = "Column Name",
  y = "Percentage Missing (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
  plot.title = element_text(face = "bold", size = 16),
  axis.text.y = element_text(size = 11)
  )

print(p2)

3.11 MISSING DATA SUMMARY TABLES

3.11.1 Housing Violations – Missingness Summary Table

Code
missing_summary_housing <- tibble(
  Metric = c(
  "Total Columns",
  "Columns > 0% Missing",
  "Columns > 50% Missing",
  "Columns == 100% Missing",
  "Columns == 0% Missing"
  ),
  Value = c(
    nrow(missing_housing),
    nrow(filter(missing_housing, missing_pct > 0)),
    nrow(filter(missing_housing, missing_pct > 50)),
    nrow(filter(missing_housing, missing_pct == 100)),
    nrow(filter(missing_housing, missing_pct == 0))
  )
)

missing_summary_housing
# A tibble: 5 × 2
  Metric                  Value
  <chr>                   <int>
1 Total Columns              56
2 Columns > 0% Missing       17
3 Columns > 50% Missing       3
4 Columns == 100% Missing     0
5 Columns == 0% Missing      39

3.11.2 311 Complaints – Missingness Summary Table

Code
missing_summary_311 <- tibble(
  Metric = c(
  "Total Columns",
  "Columns > 0% Missing",
  "Columns > 50% Missing",
  "Columns == 100% Missing",
  "Columns == 0% Missing"
  ),
  Value = c(
    nrow(missing_311),
    nrow(filter(missing_311, missing_pct > 0)),
    nrow(filter(missing_311, missing_pct > 50)),
    nrow(filter(missing_311, missing_pct == 100)),
    nrow(filter(missing_311, missing_pct == 0))
  )
)

missing_summary_311
# A tibble: 5 × 2
  Metric                  Value
  <chr>                   <int>
1 Total Columns              48
2 Columns > 0% Missing       11
3 Columns > 50% Missing       8
4 Columns == 100% Missing     8
5 Columns == 0% Missing      37

3.12 MISSING VALUE HEATMAPS

Code
# Simple aggregated missing plot
plot_missing(housing, percent = TRUE, num_char = 5, max_cols = 10)  
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.

Code
plot_missing(sr311, percent = TRUE, num_char = 5, max_cols = 10) 
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_col()`).

Code
library(redav)

# ========== HOUSING VIOLATIONS - FIXED ==========
plot_missing(housing,
             percent = TRUE,
             num_char = 5, max_cols = 10)  # Shorten column names to 5 characters
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.

Code
# Alternative: Manually create cleaner version
# First, let's see how many columns we're dealing with
cat("Number of columns:", ncol(housing))
Number of columns: 56

3.13 3-year per-quarter faceted plots and combined panel (these come right AFTER the category comparison block)

Code
# ----- HOUSING: QUARTERLY CATEGORY TRENDS (3-LINE-QTRS) -----

housing_quarter_lines <- housing |>
  filter(!is.na(inspectiondate_clean)) |>
    mutate(
      year = year(inspectiondate_clean),
      quarter = paste0("Q", quarter(inspectiondate_clean))
    ) |>
      count(year, quarter, ViolationCategory) |>
        mutate(
          quarter = factor(quarter, levels = c("Q1","Q2","Q3","Q4"))
        )

ggplot(housing_quarter_lines,
       aes(x = quarter, y = n, color = factor(year), group = year)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.2) +
  scale_y_continuous(labels = scales::comma) +
  scale_color_manual(values = c("2023" = "#E63946",
                                "2024" = "#457B9D",
                                "2025" = "#2A9D8F"),
                     name = "Year") +
  facet_wrap(~ ViolationCategory, scales = "free_y") +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    axis.text.x = element_text(angle = 0),
    legend.position = "top"
  ) +
  labs(
  title = "Housing Violations by Category Across Quarters",
  subtitle = "Manhattan (2023–2025) • 3-Year Trend",
  x = "Quarter",
  y = "Number of Violations"
)

Code
# ----- 311: QUARTERLY CATEGORY TRENDS (3-LINE-QTRS) -----

sr311_quarter_lines <- sr311 |>
  filter(!is.na(created_clean)) |>
    mutate(
      year = year(created_clean),
      quarter = paste0("Q", quarter(created_clean))
    ) |>
      count(year, quarter, Category) |>
      mutate(
        quarter = factor(quarter, levels = c("Q1","Q2","Q3","Q4"))
      )

ggplot(sr311_quarter_lines,
  aes(x = quarter, y = n, color = factor(year), group = year)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.2) +
  scale_y_continuous(labels = scales::comma) +
  scale_color_manual(values = c("2023" = "#E63946",
  "2024" = "#457B9D",
  "2025" = "#2A9D8F"),
  name = "Year") +
  facet_wrap(~ Category, scales = "free_y") +
  theme_minimal(base_size = 14) +
  theme(
  plot.title = element_text(face = "bold"),
  axis.text.x = element_text(angle = 0),
  legend.position = "top"
  ) +
  labs(
  title = "311 Housing Complaints by Category Across Quarters",
  subtitle = "Manhattan (2023–2025) • 3-Year Trend",
  x = "Quarter",
  y = "Number of Complaints"
)

Code
# ----- COMBINED HOUSING vs 311: QUARTERLY CATEGORY TRENDS -----

# Housing quarterly counts

housing_q <- housing |>
  filter(!is.na(inspectiondate_clean)) |>
    mutate(
    year = year(inspectiondate_clean),
    quarter = paste0("Q", quarter(inspectiondate_clean))
    ) |>
      count(year, quarter, ViolationCategory) |>
        rename(Category = ViolationCategory) |>
          mutate(Source = "Housing Violations")

# 311 quarterly counts

sr311_q <- sr311 |>
filter(!is.na(created_clean)) |>
  mutate(
    year = year(created_clean),
    quarter = paste0("Q", quarter(created_clean))
  ) |>
    count(year, quarter, Category) |>
      mutate(Source = "311 Complaints")

# Combine

combined_q <- bind_rows(housing_q, sr311_q) |>
  mutate(
    quarter = factor(quarter, levels = c("Q1", "Q2", "Q3", "Q4")),
    year = factor(year)
  ) |>
    filter(Category %in% comparison_categories)  # keep only the 7 categories

ggplot(combined_q,
  aes(x = quarter, y = n, color = Source, group = Source)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  facet_wrap(~ Category, scales = "free_y") +
  scale_color_manual(values = c(
  "Housing Violations" = "#D62828",
  "311 Complaints" = "#1D3557"
  )) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal(base_size = 14) +
  theme(
  legend.position = "top",
  plot.title = element_text(face = "bold")
  ) +
  labs(
  title = "Combined Quarterly Trends: Housing Violations vs 311 Complaints",
  subtitle = "Manhattan (2023–2025) • Faceted by Category",
  x = "Quarter",
  y = "Count",
  color = "Source"
)

3.14 Housing Violation Monthly Transition Alluvial

Code
# ---- Housing Violation Monthly Transition Alluvial ----
# Uses cleaned date columns created earlier: inspectiondate_clean, novissued_clean, statusdate_clean


viol_flow <- housing |>
  # select the cleaned date cols we created earlier (keep original cols untouched)
  select(ViolationID, Class, inspectiondate_clean, novissued_clean, statusdate_clean) |>
  # convert dates to YYYY-MM character format for clearer alluvial axes
  mutate(
    insp_m  = ifelse(!is.na(inspectiondate_clean),
                     format(floor_date(inspectiondate_clean, "month"), "%Y-%m"),
                     NA_character_),
    nov_m   = ifelse(!is.na(novissued_clean),
                     format(floor_date(novissued_clean, "month"), "%Y-%m"),
                     NA_character_),
    stat_m  = ifelse(!is.na(statusdate_clean),
                     format(floor_date(statusdate_clean, "month"), "%Y-%m"),
                     NA_character_)
  ) |>
  # require an inspection + at least one later stage
  filter(!is.na(insp_m) & ( !is.na(nov_m) | !is.na(stat_m) )) |>
  group_by(insp_m, nov_m, stat_m, Class) |>
  summarise(n = n(), .groups = "drop")

# Quick check
print(head(viol_flow, 6))
# A tibble: 6 × 5
  insp_m  nov_m   stat_m  Class     n
  <chr>   <chr>   <chr>   <chr> <int>
1 2022-01 2022-01 2022-01 A       115
2 2022-01 2022-01 2022-01 B       341
3 2022-01 2022-01 2022-01 C       225
4 2022-01 2022-01 2022-02 A         3
5 2022-01 2022-01 2022-02 B        72
6 2022-01 2022-01 2022-02 C       331
Code
# plot
ggplot(viol_flow,
       aes(axis1 = insp_m, axis2 = nov_m, axis3 = stat_m, y = n)) +
  geom_alluvium(aes(fill = Class), width = 0.25, alpha = 0.85) +
  geom_stratum(width = 0.25) +
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum)),
            size = 2.5) +
  theme_minimal() +
  labs(
    title = "Housing Violations Flow Across Dates (Monthly): Inspection → NOV → Status",
    x = "Month",
    y = "Count"
  ) +
  theme(
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    plot.title = element_text(face = "bold")
  )
Warning in to_lodes_form(data = data, axes = axis_ind, discern =
params$discern): Some strata appear at multiple axes.
Warning in to_lodes_form(data = data, axes = axis_ind, discern =
params$discern): Some strata appear at multiple axes.
Warning in to_lodes_form(data = data, axes = axis_ind, discern =
params$discern): Some strata appear at multiple axes.

3.14.1 Time Series Daily

Code
housing |>
  filter(!is.na(inspectiondate_clean)) |>
  count(inspectiondate_clean) |>
  ggplot(aes(x = inspectiondate_clean, y = n)) +
  geom_line(color = "red") +
  labs(title = "Housing Violations Timeline (daily, Manhattan, 2023-2025)",
  x = "Inspection date", y = "Count") +
  theme_minimal()

Code
sr311 |>
  filter(!is.na(created_clean)) |>
  count(created_clean) |>
  ggplot(aes(x = created_clean, y = n)) +
  geom_line(color = "orange") +
  labs(title = "311 Requests Timeline (daily, Manhattan, 2023-2025)",
  x = "Created date", y = "Count") +
  theme_minimal()

3.15 Monthly combined time series (line)

Code
housing_month <- housing |>
  filter(!is.na(inspectiondate_clean)) |>
  mutate(month = floor_date(inspectiondate_clean, "month")) |>
  count(month, name = "HousingCount")

sr311_month <- sr311 |>
  filter(!is.na(created_clean)) |>
  mutate(month = floor_date(created_clean, "month")) |>
  count(month, name = "ComplaintCount")

monthly_combined <- full_join(housing_month, sr311_month, by = "month") |>
  replace_na(list(HousingCount = 0, ComplaintCount = 0))

monthly_combined |>
  pivot_longer(cols = c(HousingCount, ComplaintCount), names_to = "Source", values_to = "Count") |>
  ggplot(aes(x = month, y = Count, color = Source)) +
  geom_line(size = 1) +
  geom_point(size = 1.5) +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Monthly: Housing Violations vs 311 Complaints (Manhattan)",
  x = "Month", y = "Count")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

3.16 Top 15 311 Complaint Types

Code
sr311 |>
  filter(!is.na(Complaint.Type)) |> 
  count(Complaint.Type) |>
  slice_max(n, n = 15) |>
  ggplot(aes(x = reorder(Complaint.Type, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 15 311 Complaints (Manhattan, 3 Years)", x = "", y = "Count") +
  theme_minimal()

3.17 Top 20 streets by housing violations

Code
housing |>
filter(!is.na(StreetName)) |>
count(StreetName) |>
slice_max(n, n = 20) |>
ggplot(aes(x = reorder(StreetName, n), y = n)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Top 20 Manhattan Streets by Housing Violations (2023-2025)",
x = "Street", y = "Count") +
theme_minimal()

3.18 Top 20 ZIPs for 311

Code
sr311 |>
filter(!is.na(Incident.Zip)) |>
count(Incident.Zip) |>
slice_max(n, n = 20) |>
ggplot(aes(x = reorder(Incident.Zip, n), y = n)) +
geom_col(fill = "purple") +
coord_flip() +
labs(title = "Top 20 ZIP Codes by 311 Complaints (Manhattan, 2023-2025)",
x = "ZIP", y = "Count") +
theme_minimal()

3.19 Housing Alluvial: Class -> Status (3 Years)

Code
housing |>
filter(!is.na(Class), !is.na(CurrentStatusID)) |>
count(Class, CurrentStatusID) |>
ggplot(aes(axis1 = Class, axis2 = CurrentStatusID, y = n)) +
geom_alluvium(aes(fill = Class), width = 0.25) +
geom_stratum(width = 0.25) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
theme_minimal() +
labs(title = "Housing Violations: Class → Current Status (3 Years)", y = "Count")

3.20 Alluvial: Class -> NOV Type

Code
housing |>
filter(!is.na(Class), !is.na(NovType)) |>
count(Class, NovType) |>
ggplot(aes(axis1 = Class, axis2 = NovType, y = n)) +
geom_alluvium(aes(fill = Class), width = 0.25) +
geom_stratum(width = 0.25) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
theme_minimal() +
labs(title = "Housing Violations: Class → NOV Type (3 Years)", y = "Count")

3.21 Spatial sample plots

Code
housing |>
  filter(!is.na(Longitude) & !is.na(Latitude)) |>
  slice_sample(n = 5000) |>
  ggplot(aes(x = Longitude, y = Latitude)) +
  geom_point(alpha = 0.4, size = 0.6) +
  theme_minimal() +
  labs(title = "Housing Violations Locations (sample 5k)", x = "Longitude", y = "Latitude")

Code
sr311 |>
  filter(!is.na(Longitude) & !is.na(Latitude)) |>
  slice_sample(n = 5000) |>
  ggplot(aes(x = Longitude, y = Latitude)) +
  geom_point(alpha = 0.4, size = 0.6, color = "orange") +
  theme_minimal() +
  labs(title = "311 Complaints Locations (sample 5k)", x = "Longitude", y = "Latitude")

3.22 Top 15 Housing Classes

Code
housing |>
  filter(!is.na(Class)) |>
  count(Class) |>
  slice_max(n, n = 15) |>
  ggplot(aes(x = reorder(Class, n), y = n)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  labs(title = "Top 15 Housing Violation Classes (Manhattan, 2023-2025)", x = "Class", y = "Count") +
  theme_minimal()

Code
# build paths using the month text columns we created earlier (inspection_month, nov_month, status_month)

viol_path_counts <- housing |>
select(ViolationID, Class, inspection_month, nov_month, status_month) |>

# require inspection month and at least one later stage (nov or status)

filter(!is.na(inspection_month) & ( !is.na(nov_month) | !is.na(status_month) )) |>
group_by(inspection_month, nov_month, status_month, Class) |>
summarise(n = n(), .groups = "drop") |>
arrange(desc(n))

# Quick view

print(head(viol_path_counts, 8))
# A tibble: 8 × 5
  inspection_month nov_month     status_month  Class     n
  <chr>            <chr>         <chr>         <chr> <int>
1 June 2025        June 2025     June 2025     A      6411
2 November 2025    <NA>          November 2025 I      5666
3 April 2024       April 2024    April 2024    A      5505
4 March 2023       March 2023    March 2023    A      5340
5 February 2022    February 2022 February 2022 A      4991
6 October 2025     October 2025  October 2025  B      4026
7 November 2022    <NA>          November 2022 I      3529
8 August 2025      August 2025   August 2025   B      3280
Code
# Alluvial with 3 axes (InspectionMonth -> NOVMonth -> StatusMonth)

ggplot(viol_path_counts,
aes(axis1 = inspection_month, axis2 = nov_month, axis3 = status_month, y = n)) +
geom_alluvium(aes(fill = Class), width = 0.25, alpha = 0.85) +
geom_stratum(width = 0.25) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 2.5) +
theme_minimal() +
labs(title = "Housing Violations Flow (Monthly): Inspection → NOV → Final Status",
x = "Stage (Month Year)", y = "Count") +
theme(axis.text.x = element_text(size = 9, angle = 45, hjust = 1))
Warning in to_lodes_form(data = data, axes = axis_ind, discern =
params$discern): Some strata appear at multiple axes.
Warning in to_lodes_form(data = data, axes = axis_ind, discern =
params$discern): Some strata appear at multiple axes.
Warning in to_lodes_form(data = data, axes = axis_ind, discern =
params$discern): Some strata appear at multiple axes.

4 Time series line chart

Code
set.seed(2025)

# Load data
housing_2 <- as.data.table(housing)
  # fread("datasets/Housing_Violations_2022_onwards.csv")

# Convert date and extract time components
housing_2[, InspectionDate := as.Date(InspectionDate)]
housing_2[, YearMonth := format(InspectionDate, "%Y-%m")]
housing_2[, Month := month(InspectionDate)]
housing_2[, Year := year(InspectionDate)]

# Count violations by month and year
viol_by_month <- housing_2[, .N, by = .(Year, Month)][order(Year, Month)]

# Create the plot
ggplot(viol_by_month, aes(x = Month, y = N, color = factor(Year), group = Year)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2.5) +
  scale_x_continuous(breaks = 1:12, 
                     labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Housing Violations by Month and Year",
    subtitle = "Manhattan, 2022-2025",
    x = "Month",
    y = "Number of Violations",
    color = "Year"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "right"
  )

Code
# Investigate February 2022
feb_2022 <- housing_2[Year == 2022 & Month == 2]
cat("February 2022 violations:", nrow(feb_2022), "\n")
February 2022 violations: 28414 
Code
# Check if they're all inspected in Feb or entered in Feb
summary(feb_2022$InspectionDate)
        Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
"2022-02-01" "2022-02-15" "2022-02-15" "2022-02-15" "2022-02-16" "2022-02-28" 
Code
summary(feb_2022$ApprovedDate)
   Length     Class      Mode 
    28414 character character 

Median/Mean inspection date is February 15, 2022 and Most inspections are clustered around Feb 15-16. This is NOT a data error but this appears to be a mass inspection event or systematic sweep by HPD in mid-February 2022.

4.2 What Types of Violations Are Most Common? Research Question: What are the most common housing violations that tenants and inspectors encounter?

4.3 Violation Severity and Status Research Question: What types of violations are most common, and how seriously are they being addressed?

Code
# === GRAPH 3A: Violation Class Distribution ===
# Class definitions: A=Non-Hazardous, B=Hazardous, C=Immediately Hazardous, I=Failure to Register

class_data <- housing_2[, .N, by=Class][order(-N)]

# Add descriptive labels
class_data[, ClassLabel := fcase(
  Class == "A", "Class A: Non-Hazardous",
  Class == "B", "Class B: Hazardous",
  Class == "C", "Class C: Immediately Hazardous",
  Class == "I", "Class I: Failure to Register",
  default = Class
)]

ggplot(class_data, aes(x = reorder(ClassLabel, N), y = N, fill = Class)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = scales::comma(N)), hjust = -0.1, size = 5) +
  coord_flip() +
  scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Housing Violations by Severity Class",
    subtitle = "Manhattan, 2022-2025",
    x = "Violation Class",
    y = "Number of Violations",
    caption = "Class A: Non-hazardous | Class B: Hazardous | Class C: Immediately Hazardous"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  )

Code
# === GRAPH 3B: Rent-Impairing Violations ===

rent_data <- housing_2[, .N, by=RentImpairing]
rent_data[, Label := ifelse(RentImpairing == "Y", 
                             "Rent-Impairing\n(affects habitability)", 
                             "Non-Rent-Impairing")]

ggplot(rent_data, aes(x = Label, y = N, fill = RentImpairing)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(scales::comma(N), "\n(", 
                                round(N/sum(N)*100, 1), "%)")), 
            vjust = -0.5, size = 5) +
  scale_fill_manual(values = c("N" = "steelblue", "Y" = "darkred")) +
  scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Rent-Impairing vs Non-Rent-Impairing Violations",
    subtitle = "Manhattan, 2022-2025",
    x = "",
    y = "Number of Violations"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"
  )

4.4 Save cleaned RDS as backups

Code
saveRDS(housing, file = "housing_manhattan_3years_clean.rds")
saveRDS(sr311,   file = "sr311_manhattan_3years_clean.rds")